Exploratory Data Analysis (EDA)

import pandas as pd
import numpy as np

# Load your dataset (update path as needed)
df = pd.read_csv(r"C:\Users\pooja\Desktop\Repositories\ad688-employability-sp25A1-pooja\lightcast_job_postings.csv")
df.head()
ID LAST_UPDATED_DATE LAST_UPDATED_TIMESTAMP DUPLICATES POSTED EXPIRED DURATION SOURCE_TYPES SOURCES URL ... NAICS_2022_2 NAICS_2022_2_NAME NAICS_2022_3 NAICS_2022_3_NAME NAICS_2022_4 NAICS_2022_4_NAME NAICS_2022_5 NAICS_2022_5_NAME NAICS_2022_6 NAICS_2022_6_NAME
0 1f57d95acf4dc67ed2819eb12f049f6a5c11782c 2024-09-06 2024-09-06 20:32:57.352 Z 0 2024-06-02 2024-06-08 6.0 [\n "Company"\n] [\n "brassring.com"\n] [\n "https://sjobs.brassring.com/TGnewUI/Sear... ... 44 Retail Trade 441 Motor Vehicle and Parts Dealers 4413 Automotive Parts, Accessories, and Tire Retailers 44133 Automotive Parts and Accessories Retailers 441330 Automotive Parts and Accessories Retailers
1 0cb072af26757b6c4ea9464472a50a443af681ac 2024-08-02 2024-08-02 17:08:58.838 Z 0 2024-06-02 2024-08-01 NaN [\n "Job Board"\n] [\n "maine.gov"\n] [\n "https://joblink.maine.gov/jobs/1085740"\n] ... 56 Administrative and Support and Waste Managemen... 561 Administrative and Support Services 5613 Employment Services 56132 Temporary Help Services 561320 Temporary Help Services
2 85318b12b3331fa490d32ad014379df01855c557 2024-09-06 2024-09-06 20:32:57.352 Z 1 2024-06-02 2024-07-07 35.0 [\n "Job Board"\n] [\n "dejobs.org"\n] [\n "https://dejobs.org/dallas-tx/data-analys... ... 52 Finance and Insurance 524 Insurance Carriers and Related Activities 5242 Agencies, Brokerages, and Other Insurance Rela... 52429 Other Insurance Related Activities 524291 Claims Adjusting
3 1b5c3941e54a1889ef4f8ae55b401a550708a310 2024-09-06 2024-09-06 20:32:57.352 Z 1 2024-06-02 2024-07-20 48.0 [\n "Job Board"\n] [\n "disabledperson.com",\n "dejobs.org"\n] [\n "https://www.disabledperson.com/jobs/5948... ... 52 Finance and Insurance 522 Credit Intermediation and Related Activities 5221 Depository Credit Intermediation 52211 Commercial Banking 522110 Commercial Banking
4 cb5ca25f02bdf25c13edfede7931508bfd9e858f 2024-06-19 2024-06-19 07:00:00.000 Z 0 2024-06-02 2024-06-17 15.0 [\n "FreeJobBoard"\n] [\n "craigslist.org"\n] [\n "https://modesto.craigslist.org/sls/77475... ... 99 Unclassified Industry 999 Unclassified Industry 9999 Unclassified Industry 99999 Unclassified Industry 999999 Unclassified Industry

5 rows × 131 columns

df.columns.tolist()
['ID',
 'LAST_UPDATED_DATE',
 'LAST_UPDATED_TIMESTAMP',
 'DUPLICATES',
 'POSTED',
 'EXPIRED',
 'DURATION',
 'SOURCE_TYPES',
 'SOURCES',
 'URL',
 'ACTIVE_URLS',
 'ACTIVE_SOURCES_INFO',
 'TITLE_RAW',
 'BODY',
 'MODELED_EXPIRED',
 'MODELED_DURATION',
 'COMPANY',
 'COMPANY_NAME',
 'COMPANY_RAW',
 'COMPANY_IS_STAFFING',
 'EDUCATION_LEVELS',
 'EDUCATION_LEVELS_NAME',
 'MIN_EDULEVELS',
 'MIN_EDULEVELS_NAME',
 'MAX_EDULEVELS',
 'MAX_EDULEVELS_NAME',
 'EMPLOYMENT_TYPE',
 'EMPLOYMENT_TYPE_NAME',
 'MIN_YEARS_EXPERIENCE',
 'MAX_YEARS_EXPERIENCE',
 'IS_INTERNSHIP',
 'SALARY',
 'REMOTE_TYPE',
 'REMOTE_TYPE_NAME',
 'ORIGINAL_PAY_PERIOD',
 'SALARY_TO',
 'SALARY_FROM',
 'LOCATION',
 'CITY',
 'CITY_NAME',
 'COUNTY',
 'COUNTY_NAME',
 'MSA',
 'MSA_NAME',
 'STATE',
 'STATE_NAME',
 'COUNTY_OUTGOING',
 'COUNTY_NAME_OUTGOING',
 'COUNTY_INCOMING',
 'COUNTY_NAME_INCOMING',
 'MSA_OUTGOING',
 'MSA_NAME_OUTGOING',
 'MSA_INCOMING',
 'MSA_NAME_INCOMING',
 'NAICS2',
 'NAICS2_NAME',
 'NAICS3',
 'NAICS3_NAME',
 'NAICS4',
 'NAICS4_NAME',
 'NAICS5',
 'NAICS5_NAME',
 'NAICS6',
 'NAICS6_NAME',
 'TITLE',
 'TITLE_NAME',
 'TITLE_CLEAN',
 'SKILLS',
 'SKILLS_NAME',
 'SPECIALIZED_SKILLS',
 'SPECIALIZED_SKILLS_NAME',
 'CERTIFICATIONS',
 'CERTIFICATIONS_NAME',
 'COMMON_SKILLS',
 'COMMON_SKILLS_NAME',
 'SOFTWARE_SKILLS',
 'SOFTWARE_SKILLS_NAME',
 'ONET',
 'ONET_NAME',
 'ONET_2019',
 'ONET_2019_NAME',
 'CIP6',
 'CIP6_NAME',
 'CIP4',
 'CIP4_NAME',
 'CIP2',
 'CIP2_NAME',
 'SOC_2021_2',
 'SOC_2021_2_NAME',
 'SOC_2021_3',
 'SOC_2021_3_NAME',
 'SOC_2021_4',
 'SOC_2021_4_NAME',
 'SOC_2021_5',
 'SOC_2021_5_NAME',
 'LOT_CAREER_AREA',
 'LOT_CAREER_AREA_NAME',
 'LOT_OCCUPATION',
 'LOT_OCCUPATION_NAME',
 'LOT_SPECIALIZED_OCCUPATION',
 'LOT_SPECIALIZED_OCCUPATION_NAME',
 'LOT_OCCUPATION_GROUP',
 'LOT_OCCUPATION_GROUP_NAME',
 'LOT_V6_SPECIALIZED_OCCUPATION',
 'LOT_V6_SPECIALIZED_OCCUPATION_NAME',
 'LOT_V6_OCCUPATION',
 'LOT_V6_OCCUPATION_NAME',
 'LOT_V6_OCCUPATION_GROUP',
 'LOT_V6_OCCUPATION_GROUP_NAME',
 'LOT_V6_CAREER_AREA',
 'LOT_V6_CAREER_AREA_NAME',
 'SOC_2',
 'SOC_2_NAME',
 'SOC_3',
 'SOC_3_NAME',
 'SOC_4',
 'SOC_4_NAME',
 'SOC_5',
 'SOC_5_NAME',
 'LIGHTCAST_SECTORS',
 'LIGHTCAST_SECTORS_NAME',
 'NAICS_2022_2',
 'NAICS_2022_2_NAME',
 'NAICS_2022_3',
 'NAICS_2022_3_NAME',
 'NAICS_2022_4',
 'NAICS_2022_4_NAME',
 'NAICS_2022_5',
 'NAICS_2022_5_NAME',
 'NAICS_2022_6',
 'NAICS_2022_6_NAME']
print(df.shape)
(72476, 131)
print(df.head())
                                         ID LAST_UPDATED_DATE  \
0  1f57d95acf4dc67ed2819eb12f049f6a5c11782c        2024-09-06   
1  0cb072af26757b6c4ea9464472a50a443af681ac        2024-08-02   
2  85318b12b3331fa490d32ad014379df01855c557        2024-09-06   
3  1b5c3941e54a1889ef4f8ae55b401a550708a310        2024-09-06   
4  cb5ca25f02bdf25c13edfede7931508bfd9e858f        2024-06-19   

      LAST_UPDATED_TIMESTAMP  DUPLICATES      POSTED     EXPIRED  DURATION  \
0  2024-09-06 20:32:57.352 Z           0  2024-06-02  2024-06-08       6.0   
1  2024-08-02 17:08:58.838 Z           0  2024-06-02  2024-08-01       NaN   
2  2024-09-06 20:32:57.352 Z           1  2024-06-02  2024-07-07      35.0   
3  2024-09-06 20:32:57.352 Z           1  2024-06-02  2024-07-20      48.0   
4  2024-06-19 07:00:00.000 Z           0  2024-06-02  2024-06-17      15.0   

             SOURCE_TYPES                                        SOURCES  \
0       [\n  "Company"\n]                        [\n  "brassring.com"\n]   
1     [\n  "Job Board"\n]                            [\n  "maine.gov"\n]   
2     [\n  "Job Board"\n]                           [\n  "dejobs.org"\n]   
3     [\n  "Job Board"\n]  [\n  "disabledperson.com",\n  "dejobs.org"\n]   
4  [\n  "FreeJobBoard"\n]                       [\n  "craigslist.org"\n]   

                                                 URL  ... NAICS_2022_2  \
0  [\n  "https://sjobs.brassring.com/TGnewUI/Sear...  ...           44   
1   [\n  "https://joblink.maine.gov/jobs/1085740"\n]  ...           56   
2  [\n  "https://dejobs.org/dallas-tx/data-analys...  ...           52   
3  [\n  "https://www.disabledperson.com/jobs/5948...  ...           52   
4  [\n  "https://modesto.craigslist.org/sls/77475...  ...           99   

                                   NAICS_2022_2_NAME NAICS_2022_3  \
0                                       Retail Trade          441   
1  Administrative and Support and Waste Managemen...          561   
2                              Finance and Insurance          524   
3                              Finance and Insurance          522   
4                              Unclassified Industry          999   

                              NAICS_2022_3_NAME NAICS_2022_4  \
0               Motor Vehicle and Parts Dealers         4413   
1           Administrative and Support Services         5613   
2     Insurance Carriers and Related Activities         5242   
3  Credit Intermediation and Related Activities         5221   
4                         Unclassified Industry         9999   

                                   NAICS_2022_4_NAME  NAICS_2022_5  \
0  Automotive Parts, Accessories, and Tire Retailers         44133   
1                                Employment Services         56132   
2  Agencies, Brokerages, and Other Insurance Rela...         52429   
3                   Depository Credit Intermediation         52211   
4                              Unclassified Industry         99999   

                            NAICS_2022_5_NAME NAICS_2022_6  \
0  Automotive Parts and Accessories Retailers       441330   
1                     Temporary Help Services       561320   
2          Other Insurance Related Activities       524291   
3                          Commercial Banking       522110   
4                       Unclassified Industry       999999   

                            NAICS_2022_6_NAME  
0  Automotive Parts and Accessories Retailers  
1                     Temporary Help Services  
2                            Claims Adjusting  
3                          Commercial Banking  
4                       Unclassified Industry  

[5 rows x 131 columns]
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72476 entries, 0 to 72475
Columns: 131 entries, ID to NAICS_2022_6_NAME
dtypes: bool(2), float64(11), int64(27), object(91)
memory usage: 71.5+ MB
None
df.isnull().sum().to_frame()
0
ID 0
LAST_UPDATED_DATE 0
LAST_UPDATED_TIMESTAMP 0
DUPLICATES 0
POSTED 0
... ...
NAICS_2022_4_NAME 0
NAICS_2022_5 0
NAICS_2022_5_NAME 0
NAICS_2022_6 0
NAICS_2022_6_NAME 0

131 rows × 1 columns

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(16,6))
sns.heatmap(df.isnull(), cbar=False)
plt.title("Missing Values Heatmap")
plt.show()

missing_counts = df.isnull().sum().sort_values(ascending=False)
print(missing_counts[missing_counts > 0])
ACTIVE_SOURCES_INFO       64654
MAX_YEARS_EXPERIENCE      64046
MAX_EDULEVELS_NAME        56155
MAX_EDULEVELS             56155
LIGHTCAST_SECTORS         54682
LIGHTCAST_SECTORS_NAME    54682
SALARY                    41658
SALARY_FROM               40068
SALARY_TO                 40068
ORIGINAL_PAY_PERIOD       40068
DURATION                  27294
MIN_YEARS_EXPERIENCE      23113
MODELED_DURATION          19261
MODELED_EXPIRED           15383
EXPIRED                    7822
MSA_INCOMING               3921
MSA_NAME_INCOMING          3921
MSA_NAME                   3908
MSA                        3908
MSA_NAME_OUTGOING          3908
MSA_OUTGOING               3908
COMPANY_RAW                 497
TITLE_CLEAN                  96
TITLE_RAW                    60
dtype: int64
thresh = 0.7 * len(df)
df = df.loc[:, df.isnull().sum() < thresh]
threshold = 0.7 * len(df)
df = df.loc[:, df.isnull().sum() < threshold]
df['SALARY'] = df['SALARY'].fillna(df['SALARY'].median())
df['MIN_YEARS_EXPERIENCE'] = df['MIN_YEARS_EXPERIENCE'].fillna(df['MIN_YEARS_EXPERIENCE'].median())
df['DURATION'] = df['DURATION'].fillna(df['DURATION'].median())
df['COMPANY_RAW'] = df['COMPANY_RAW'].fillna('Unknown')
df['TITLE_CLEAN'] = df['TITLE_CLEAN'].fillna('Unknown')
df['TITLE_RAW'] = df['TITLE_RAW'].fillna('Unknown')
df = df.dropna(subset=['SALARY', 'TITLE_CLEAN'])  # Only if these are critical for your analysis
print(df.isnull().sum()[df.isnull().sum() > 0])
EXPIRED                 7822
MODELED_EXPIRED        15383
MODELED_DURATION       19261
ORIGINAL_PAY_PERIOD    40068
SALARY_TO              40068
SALARY_FROM            40068
MSA                     3908
MSA_NAME                3908
MSA_OUTGOING            3908
MSA_NAME_OUTGOING       3908
MSA_INCOMING            3921
MSA_NAME_INCOMING       3921
dtype: int64
df['SALARY_TO'] = df['SALARY_TO'].fillna(df['SALARY_TO'].median())
df['SALARY_FROM'] = df['SALARY_FROM'].fillna(df['SALARY_FROM'].median())
df['ORIGINAL_PAY_PERIOD'] = df['ORIGINAL_PAY_PERIOD'].fillna('Unknown')
df['EXPIRED'] = df['EXPIRED'].fillna('Unknown')
df['MODELED_EXPIRED'] = df['MODELED_EXPIRED'].fillna('Unknown')
df['MSA'] = df['MSA'].fillna('Unknown')
df['MSA_NAME'] = df['MSA_NAME'].fillna('Unknown')
df['MSA_OUTGOING'] = df['MSA_OUTGOING'].fillna('Unknown')
df['MSA_NAME_OUTGOING'] = df['MSA_NAME_OUTGOING'].fillna('Unknown')
df['MSA_INCOMING'] = df['MSA_INCOMING'].fillna('Unknown')
df['MSA_NAME_INCOMING'] = df['MSA_NAME_INCOMING'].fillna('Unknown')
df['MODELED_DURATION'] = df['MODELED_DURATION'].fillna(df['MODELED_DURATION'].median())
print(df.isnull().sum()[df.isnull().sum() > 0])
Series([], dtype: int64)
print(df.shape)
print(df.columns)
print(df.dtypes)
print(df.head())
(72476, 125)
Index(['ID', 'LAST_UPDATED_DATE', 'LAST_UPDATED_TIMESTAMP', 'DUPLICATES',
       'POSTED', 'EXPIRED', 'DURATION', 'SOURCE_TYPES', 'SOURCES', 'URL',
       ...
       'NAICS_2022_2', 'NAICS_2022_2_NAME', 'NAICS_2022_3',
       'NAICS_2022_3_NAME', 'NAICS_2022_4', 'NAICS_2022_4_NAME',
       'NAICS_2022_5', 'NAICS_2022_5_NAME', 'NAICS_2022_6',
       'NAICS_2022_6_NAME'],
      dtype='object', length=125)
ID                        object
LAST_UPDATED_DATE         object
LAST_UPDATED_TIMESTAMP    object
DUPLICATES                 int64
POSTED                    object
                           ...  
NAICS_2022_4_NAME         object
NAICS_2022_5               int64
NAICS_2022_5_NAME         object
NAICS_2022_6               int64
NAICS_2022_6_NAME         object
Length: 125, dtype: object
                                         ID LAST_UPDATED_DATE  \
0  1f57d95acf4dc67ed2819eb12f049f6a5c11782c        2024-09-06   
1  0cb072af26757b6c4ea9464472a50a443af681ac        2024-08-02   
2  85318b12b3331fa490d32ad014379df01855c557        2024-09-06   
3  1b5c3941e54a1889ef4f8ae55b401a550708a310        2024-09-06   
4  cb5ca25f02bdf25c13edfede7931508bfd9e858f        2024-06-19   

      LAST_UPDATED_TIMESTAMP  DUPLICATES      POSTED     EXPIRED  DURATION  \
0  2024-09-06 20:32:57.352 Z           0  2024-06-02  2024-06-08       6.0   
1  2024-08-02 17:08:58.838 Z           0  2024-06-02  2024-08-01      18.0   
2  2024-09-06 20:32:57.352 Z           1  2024-06-02  2024-07-07      35.0   
3  2024-09-06 20:32:57.352 Z           1  2024-06-02  2024-07-20      48.0   
4  2024-06-19 07:00:00.000 Z           0  2024-06-02  2024-06-17      15.0   

             SOURCE_TYPES                                        SOURCES  \
0       [\n  "Company"\n]                        [\n  "brassring.com"\n]   
1     [\n  "Job Board"\n]                            [\n  "maine.gov"\n]   
2     [\n  "Job Board"\n]                           [\n  "dejobs.org"\n]   
3     [\n  "Job Board"\n]  [\n  "disabledperson.com",\n  "dejobs.org"\n]   
4  [\n  "FreeJobBoard"\n]                       [\n  "craigslist.org"\n]   

                                                 URL  ... NAICS_2022_2  \
0  [\n  "https://sjobs.brassring.com/TGnewUI/Sear...  ...           44   
1   [\n  "https://joblink.maine.gov/jobs/1085740"\n]  ...           56   
2  [\n  "https://dejobs.org/dallas-tx/data-analys...  ...           52   
3  [\n  "https://www.disabledperson.com/jobs/5948...  ...           52   
4  [\n  "https://modesto.craigslist.org/sls/77475...  ...           99   

                                   NAICS_2022_2_NAME NAICS_2022_3  \
0                                       Retail Trade          441   
1  Administrative and Support and Waste Managemen...          561   
2                              Finance and Insurance          524   
3                              Finance and Insurance          522   
4                              Unclassified Industry          999   

                              NAICS_2022_3_NAME  NAICS_2022_4  \
0               Motor Vehicle and Parts Dealers          4413   
1           Administrative and Support Services          5613   
2     Insurance Carriers and Related Activities          5242   
3  Credit Intermediation and Related Activities          5221   
4                         Unclassified Industry          9999   

                                   NAICS_2022_4_NAME NAICS_2022_5  \
0  Automotive Parts, Accessories, and Tire Retailers        44133   
1                                Employment Services        56132   
2  Agencies, Brokerages, and Other Insurance Rela...        52429   
3                   Depository Credit Intermediation        52211   
4                              Unclassified Industry        99999   

                            NAICS_2022_5_NAME  NAICS_2022_6  \
0  Automotive Parts and Accessories Retailers        441330   
1                     Temporary Help Services        561320   
2          Other Insurance Related Activities        524291   
3                          Commercial Banking        522110   
4                       Unclassified Industry        999999   

                            NAICS_2022_6_NAME  
0  Automotive Parts and Accessories Retailers  
1                     Temporary Help Services  
2                            Claims Adjusting  
3                          Commercial Banking  
4                       Unclassified Industry  

[5 rows x 125 columns]
print(df.describe(include='all'))
                                              ID LAST_UPDATED_DATE  \
count                                      72476             72476   
unique                                     72476               169   
top     733c12969489de888093ef22d09204dc0945148a        2024-10-09   
freq                                           1             22304   
mean                                         NaN               NaN   
std                                          NaN               NaN   
min                                          NaN               NaN   
25%                                          NaN               NaN   
50%                                          NaN               NaN   
75%                                          NaN               NaN   
max                                          NaN               NaN   

           LAST_UPDATED_TIMESTAMP    DUPLICATES      POSTED  EXPIRED  \
count                       72476  72476.000000       72476    72476   
unique                        174           NaN         153      178   
top     2024-10-09 18:07:44.758 Z           NaN  2024-05-24  Unknown   
freq                        19878           NaN        1050     7822   
mean                          NaN      1.081627         NaN      NaN   
std                           NaN      2.807512         NaN      NaN   
min                           NaN      0.000000         NaN      NaN   
25%                           NaN      0.000000         NaN      NaN   
50%                           NaN      0.000000         NaN      NaN   
75%                           NaN      1.000000         NaN      NaN   
max                           NaN    100.000000         NaN      NaN   

            DURATION         SOURCE_TYPES             SOURCES  \
count   72476.000000                72476               72476   
unique           NaN                  175               12890   
top              NaN  [\n  "Job Board"\n]  [\n  "dice.com"\n]   
freq             NaN                45182               10596   
mean       20.694796                  NaN                 NaN   
std        11.529174                  NaN                 NaN   
min         0.000000                  NaN                 NaN   
25%        15.000000                  NaN                 NaN   
50%        18.000000                  NaN                 NaN   
75%        23.000000                  NaN                 NaN   
max        59.000000                  NaN                 NaN   

                                                      URL  ...  NAICS_2022_2  \
count                                               72476  ...  72476.000000   
unique                                              72345  ...           NaN   
top     [\n  "https://www2.jobdiva.com/portal/?a=u4jdn...  ...           NaN   
freq                                                   29  ...           NaN   
mean                                                  NaN  ...     58.352448   
std                                                   NaN  ...     18.628253   
min                                                   NaN  ...     11.000000   
25%                                                   NaN  ...     52.000000   
50%                                                   NaN  ...     54.000000   
75%                                                   NaN  ...     56.000000   
max                                                   NaN  ...     99.000000   

                                       NAICS_2022_2_NAME  NAICS_2022_3  \
count                                              72476  72476.000000   
unique                                                21           NaN   
top     Professional, Scientific, and Technical Services           NaN   
freq                                               23318           NaN   
mean                                                 NaN    587.864245   
std                                                  NaN    186.277378   
min                                                  NaN    111.000000   
25%                                                  NaN    522.000000   
50%                                                  NaN    541.000000   
75%                                                  NaN    561.000000   
max                                                  NaN    999.000000   

                                       NAICS_2022_3_NAME  NAICS_2022_4  \
count                                              72476  72476.000000   
unique                                                97           NaN   
top     Professional, Scientific, and Technical Services           NaN   
freq                                               23318           NaN   
mean                                                 NaN   5883.118674   
std                                                  NaN   1864.277209   
min                                                  NaN   1111.000000   
25%                                                  NaN   5223.000000   
50%                                                  NaN   5415.000000   
75%                                                  NaN   5614.000000   
max                                                  NaN   9999.000000   

                                   NAICS_2022_4_NAME  NAICS_2022_5  \
count                                          72476  72476.000000   
unique                                           294           NaN   
top     Computer Systems Design and Related Services           NaN   
freq                                           10862           NaN   
mean                                             NaN  58834.284053   
std                                              NaN  18644.805144   
min                                              NaN  11115.000000   
25%                                              NaN  52231.750000   
50%                                              NaN  54151.000000   
75%                                              NaN  56149.000000   
max                                              NaN  99999.000000   

                                   NAICS_2022_5_NAME   NAICS_2022_6  \
count                                          72476   72476.000000   
unique                                           600            NaN   
top     Computer Systems Design and Related Services            NaN   
freq                                           10862            NaN   
mean                                             NaN  588345.353165   
std                                              NaN  186450.077502   
min                                              NaN  111150.000000   
25%                                              NaN  522317.500000   
50%                                              NaN  541519.000000   
75%                                              NaN  561499.000000   
max                                              NaN  999999.000000   

            NAICS_2022_6_NAME  
count                   72476  
unique                    814  
top     Unclassified Industry  
freq                     9493  
mean                      NaN  
std                       NaN  
min                       NaN  
25%                       NaN  
50%                       NaN  
75%                       NaN  
max                       NaN  

[11 rows x 125 columns]
import matplotlib.pyplot as plt
df['SALARY_FROM'].hist(bins=50)
plt.title('Distribution of Minimum Salaries')
plt.xlabel('SALARY_FROM')
plt.ylabel('Count')
plt.show()

print(df['STATE_NAME'].value_counts())
print(df['EMPLOYMENT_TYPE_NAME'].value_counts())
STATE_NAME
Texas                                      8067
California                                 7087
Florida                                    3645
Virginia                                   3636
Illinois                                   3539
New York                                   3341
North Carolina                             2747
Georgia                                    2658
Ohio                                       2627
New Jersey                                 2614
Pennsylvania                               2254
Massachusetts                              2057
Michigan                                   1838
Arizona                                    1638
Washington                                 1626
Minnesota                                  1476
Colorado                                   1455
Maryland                                   1360
Tennessee                                  1274
Missouri                                   1232
Washington, D.C. (District of Columbia)    1224
Oregon                                     1090
Wisconsin                                  1050
Indiana                                     956
Connecticut                                 863
Kansas                                      740
Alabama                                     690
South Carolina                              647
Utah                                        643
Kentucky                                    635
Iowa                                        625
Oklahoma                                    606
Nevada                                      591
Arkansas                                    584
Nebraska                                    540
Idaho                                       478
Mississippi                                 471
Rhode Island                                459
Louisiana                                   456
Delaware                                    438
Maine                                       345
New Hampshire                               296
South Dakota                                295
Hawaii                                      263
New Mexico                                  255
Alaska                                      236
Vermont                                     227
Montana                                     186
West Virginia                               159
North Dakota                                149
Wyoming                                     108
Name: count, dtype: int64
EMPLOYMENT_TYPE_NAME
Full-time (> 32 hours)    69196
Part-time (≤ 32 hours)     2298
Part-time / full-time       982
Name: count, dtype: int64
print(df.groupby('STATE_NAME')['SALARY_FROM'].mean())
print(df.groupby('EMPLOYMENT_TYPE_NAME')['SALARY_FROM'].mean())
STATE_NAME
Alabama                                    88279.007246
Alaska                                     84794.105932
Arizona                                    87683.371184
Arkansas                                   89840.611301
California                                 95423.065613
Colorado                                   90154.406873
Connecticut                                92808.732329
Delaware                                   90662.648402
Florida                                    89149.689712
Georgia                                    90026.200903
Hawaii                                     90106.038023
Idaho                                      89572.702929
Illinois                                   91415.218988
Indiana                                    88694.947699
Iowa                                       89180.659200
Kansas                                     87888.768919
Kentucky                                   86203.930709
Louisiana                                  88461.322368
Maine                                      87973.060870
Maryland                                   90777.826471
Massachusetts                              90503.149733
Michigan                                   90050.809576
Minnesota                                  90077.165312
Mississippi                                88169.420382
Missouri                                   88634.789773
Montana                                    92105.451613
Nebraska                                   89942.440741
Nevada                                     85886.473773
New Hampshire                              87656.047297
New Jersey                                 93554.228386
New Mexico                                 82922.105882
New York                                   92826.572882
North Carolina                             90884.643975
North Dakota                               85462.389262
Ohio                                       87940.357442
Oklahoma                                   88058.201320
Oregon                                     90852.360550
Pennsylvania                               89500.943212
Rhode Island                               89953.023965
South Carolina                             89081.479134
South Dakota                               85602.905085
Tennessee                                  87728.423862
Texas                                      89990.226602
Utah                                       87489.544323
Vermont                                    93845.889868
Virginia                                   92390.495325
Washington                                 94471.674662
Washington, D.C. (District of Columbia)    92732.987745
West Virginia                              85515.301887
Wisconsin                                  88277.826667
Wyoming                                    88958.398148
Name: SALARY_FROM, dtype: float64
EMPLOYMENT_TYPE_NAME
Full-time (> 32 hours)    91112.257659
Part-time (≤ 32 hours)    80780.870757
Part-time / full-time     83786.090631
Name: SALARY_FROM, dtype: float64
df.columns.tolist()
['ID',
 'LAST_UPDATED_DATE',
 'LAST_UPDATED_TIMESTAMP',
 'DUPLICATES',
 'POSTED',
 'EXPIRED',
 'DURATION',
 'SOURCE_TYPES',
 'SOURCES',
 'URL',
 'ACTIVE_URLS',
 'TITLE_RAW',
 'BODY',
 'MODELED_EXPIRED',
 'MODELED_DURATION',
 'COMPANY',
 'COMPANY_NAME',
 'COMPANY_RAW',
 'COMPANY_IS_STAFFING',
 'EDUCATION_LEVELS',
 'EDUCATION_LEVELS_NAME',
 'MIN_EDULEVELS',
 'MIN_EDULEVELS_NAME',
 'EMPLOYMENT_TYPE',
 'EMPLOYMENT_TYPE_NAME',
 'MIN_YEARS_EXPERIENCE',
 'IS_INTERNSHIP',
 'SALARY',
 'REMOTE_TYPE',
 'REMOTE_TYPE_NAME',
 'ORIGINAL_PAY_PERIOD',
 'SALARY_TO',
 'SALARY_FROM',
 'LOCATION',
 'CITY',
 'CITY_NAME',
 'COUNTY',
 'COUNTY_NAME',
 'MSA',
 'MSA_NAME',
 'STATE',
 'STATE_NAME',
 'COUNTY_OUTGOING',
 'COUNTY_NAME_OUTGOING',
 'COUNTY_INCOMING',
 'COUNTY_NAME_INCOMING',
 'MSA_OUTGOING',
 'MSA_NAME_OUTGOING',
 'MSA_INCOMING',
 'MSA_NAME_INCOMING',
 'NAICS2',
 'NAICS2_NAME',
 'NAICS3',
 'NAICS3_NAME',
 'NAICS4',
 'NAICS4_NAME',
 'NAICS5',
 'NAICS5_NAME',
 'NAICS6',
 'NAICS6_NAME',
 'TITLE',
 'TITLE_NAME',
 'TITLE_CLEAN',
 'SKILLS',
 'SKILLS_NAME',
 'SPECIALIZED_SKILLS',
 'SPECIALIZED_SKILLS_NAME',
 'CERTIFICATIONS',
 'CERTIFICATIONS_NAME',
 'COMMON_SKILLS',
 'COMMON_SKILLS_NAME',
 'SOFTWARE_SKILLS',
 'SOFTWARE_SKILLS_NAME',
 'ONET',
 'ONET_NAME',
 'ONET_2019',
 'ONET_2019_NAME',
 'CIP6',
 'CIP6_NAME',
 'CIP4',
 'CIP4_NAME',
 'CIP2',
 'CIP2_NAME',
 'SOC_2021_2',
 'SOC_2021_2_NAME',
 'SOC_2021_3',
 'SOC_2021_3_NAME',
 'SOC_2021_4',
 'SOC_2021_4_NAME',
 'SOC_2021_5',
 'SOC_2021_5_NAME',
 'LOT_CAREER_AREA',
 'LOT_CAREER_AREA_NAME',
 'LOT_OCCUPATION',
 'LOT_OCCUPATION_NAME',
 'LOT_SPECIALIZED_OCCUPATION',
 'LOT_SPECIALIZED_OCCUPATION_NAME',
 'LOT_OCCUPATION_GROUP',
 'LOT_OCCUPATION_GROUP_NAME',
 'LOT_V6_SPECIALIZED_OCCUPATION',
 'LOT_V6_SPECIALIZED_OCCUPATION_NAME',
 'LOT_V6_OCCUPATION',
 'LOT_V6_OCCUPATION_NAME',
 'LOT_V6_OCCUPATION_GROUP',
 'LOT_V6_OCCUPATION_GROUP_NAME',
 'LOT_V6_CAREER_AREA',
 'LOT_V6_CAREER_AREA_NAME',
 'SOC_2',
 'SOC_2_NAME',
 'SOC_3',
 'SOC_3_NAME',
 'SOC_4',
 'SOC_4_NAME',
 'SOC_5',
 'SOC_5_NAME',
 'NAICS_2022_2',
 'NAICS_2022_2_NAME',
 'NAICS_2022_3',
 'NAICS_2022_3_NAME',
 'NAICS_2022_4',
 'NAICS_2022_4_NAME',
 'NAICS_2022_5',
 'NAICS_2022_5_NAME',
 'NAICS_2022_6',
 'NAICS_2022_6_NAME']
df.shape
(72476, 125)

#1. Salary Distribution: AI vs. Non-AI Careers (right)

ai_keywords = ['AI', 'Artificial Intelligence', 'Machine Learning', 'Deep Learning', 'Data Scientist', 'NLP', 'Computer Vision']
df['IS_AI_CAREER'] = df['TITLE_CLEAN'].str.contains('|'.join(ai_keywords), case=False, na=False).astype(int)
plt.figure(figsize=(10,6))
sns.histplot(data=df, x='SALARY', hue='IS_AI_CAREER', kde=True, bins=40)
plt.title('Salary Distribution: AI vs. Non-AI Careers')
plt.xlabel('Salary')
plt.ylabel('Number of Job Postings')
plt.show()

!pip3 install plotly
Requirement already satisfied: plotly in c:\python312\lib\site-packages (6.0.1)
Requirement already satisfied: narwhals>=1.15.1 in c:\python312\lib\site-packages (from plotly) (1.37.1)
Requirement already satisfied: packaging in c:\users\pooja\appdata\roaming\python\python312\site-packages (from plotly) (24.1)

[notice] A new release of pip is available: 24.0 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip
import plotly.express as px

fig = px.histogram(
    df,
    x='SALARY',
    color='IS_AI_CAREER',  # hue equivalent
    nbins=40,
    marginal='rug',         # optional: shows distribution marks on axis
    opacity=0.7,
    barmode='overlay',      # bars overlap like in Seaborn with hue
    title='Salary Distribution: AI vs. Non-AI Careers',
    labels={'SALARY': 'Salary', 'IS_AI_CAREER': 'AI Career?'}
)

fig.update_layout(
    xaxis_title='Salary',
    yaxis_title='Number of Job Postings',
    bargap=0.05
)

fig.show()

2. Boxplot Salary Distribution: AI vs. Non-AI Careers

import plotly.express as px

fig = px.box(
    df,
    x='IS_AI_CAREER',
    y='SALARY',
    color='IS_AI_CAREER',
    title='Salary Distribution: AI vs. Non-AI Careers',
    labels={
        'IS_AI_CAREER': 'AI Career (1 = AI, 0 = Non-AI)',
        'SALARY': 'Salary'
    },
    width=700,
    height=500
)

fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=[0, 1],
        ticktext=['Non-AI', 'AI']
    ),
    showlegend=False
)

fig.show()

Average Salary by Industry (2024)

import plotly.express as px

# Filter and group the data
industry_salary_2024 = (
    df[df['POSTED'].astype(str).str.contains('2024')]  # Ensure POSTED is string
    .groupby('NAICS2_NAME')['SALARY']
    .mean()
    .sort_values(ascending=False)
    .head(15)
    .reset_index()
)

# Create Plotly bar chart
fig = px.bar(
    industry_salary_2024,
    x='NAICS2_NAME',
    y='SALARY',
    title='Average Salary by Industry (2024)',
    labels={'NAICS2_NAME': 'Industry', 'SALARY': 'Average Salary'},
    height=600
)

fig.update_layout(
    xaxis_tickangle=45
)

fig.show()

Job Counts by State (AI vs. Non-AI)

import plotly.express as px

# Group and count jobs by state and AI category
region_counts = df.groupby(['STATE_NAME', 'IS_AI_CAREER']).size().reset_index(name='count')

# Create grouped bar chart
fig = px.bar(
    region_counts,
    x='STATE_NAME',
    y='count',
    color='IS_AI_CAREER',
    barmode='group',
    title='Job Counts: AI vs. Non-AI by State',
    labels={
        'STATE_NAME': 'State',
        'count': 'Job Count',
        'IS_AI_CAREER': 'AI Career (1 = AI, 0 = Non-AI)'
    },
    height=600,
    width=1000
)

fig.update_layout(
    xaxis_tickangle=90
)

fig.show()

Proportion of Remote Jobs in AI vs. Non-AI

import plotly.express as px

# Group by AI status and remote type
remote_counts = df.groupby(['IS_AI_CAREER', 'REMOTE_TYPE_NAME']).size().reset_index(name='count')

# Create interactive grouped bar chart
fig = px.bar(
    remote_counts,
    x='IS_AI_CAREER',
    y='count',
    color='REMOTE_TYPE_NAME',
    barmode='group',
    title='Remote/In-Office Proportion: AI vs. Non-AI Jobs',
    labels={
        'IS_AI_CAREER': 'AI Career (1 = Yes, 0 = No)',
        'count': 'Job Count',
        'REMOTE_TYPE_NAME': 'Remote Type'
    },
    height=500,
    width=800
)

fig.update_layout(xaxis=dict(tickmode='array', tickvals=[0, 1], ticktext=['Non-AI', 'AI']))
fig.show()

Correlation Matrix

# Select numeric columns only
df_numeric = df.select_dtypes(include=[np.number])
correlation_matrix = df_numeric.corr()
import plotly.express as px

fig = px.imshow(
    correlation_matrix,
    text_auto=True,
    color_continuous_scale='RdBu_r',
    title='Correlation Matrix (Numeric Features)'
)

fig.update_layout(height=700, width=800)
fig.show()

Top 10 AI Job Titles by Average Salary

import plotly.express as px

# Prepare data: top 10 AI job titles by average salary
ai_jobs = (
    df[df['IS_AI_CAREER'] == 1]
    .groupby('TITLE_CLEAN')['SALARY']
    .mean()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)

# Create bar plot
fig = px.bar(
    ai_jobs,
    x='TITLE_CLEAN',
    y='SALARY',
    title='Top 10 AI Job Titles by Average Salary',
    labels={'TITLE_CLEAN': 'Job Title', 'SALARY': 'Average Salary'},
    height=500
)

fig.update_layout(
    xaxis_tickangle=45
)

fig.show()

4. Remote vs. In-Office Salary Comparison

import plotly.express as px

# Filter data to remove missing salary or remote type
df_box = df.dropna(subset=['SALARY', 'REMOTE_TYPE_NAME'])

# Create Plotly boxplot
fig = px.box(
    df_box,
    x='REMOTE_TYPE_NAME',
    y='SALARY',
    title='Salary Distribution: Remote vs. In-Office Jobs',
    labels={'REMOTE_TYPE_NAME': 'Remote Type', 'SALARY': 'Salary'},
    color='REMOTE_TYPE_NAME',  # optional: color by category
    color_discrete_sequence=px.colors.qualitative.Set2,  # nice pastel palette
    height=800
)

fig.update_layout(
    template='plotly_white'
)

fig.show()

7. AI vs. Non-AI Job Count by Region

import plotly.express as px

# Step 1: Prepare grouped job count data (already done)
region_counts = df.groupby(['STATE_NAME', 'IS_AI_CAREER']).size().reset_index(name='count')
region_counts['Career Type'] = region_counts['IS_AI_CAREER'].map({0: 'Non-AI', 1: 'AI'})

# Step 2: Create grouped bar chart with Plotly
fig = px.bar(
    region_counts,
    x='STATE_NAME',
    y='count',
    color='Career Type',
    barmode='group',
    title='Job Counts: AI vs. Non-AI by State',
    labels={'STATE_NAME': 'State', 'count': 'Job Count'},
    height=600,
    width=1000,
    color_discrete_map={'AI': 'tomato', 'Non-AI': 'steelblue'}
)

# Step 3: Format the layout
fig.update_layout(
    xaxis_tickangle=60,
    template='plotly_white'
)

fig.show()

10. Heatmap: Salary Across States and Years (for AI Careers)

import plotly.express as px

# Create the pivot table (same as in your code)
pivot = df[df['IS_AI_CAREER'] == 1].pivot_table(
    index='STATE_NAME',
    columns='YEAR',
    values='SALARY',
    aggfunc='mean'
).reset_index()

# Melt to long format for Plotly
pivot_melted = pivot.melt(id_vars='STATE_NAME', var_name='YEAR', value_name='Average_Salary')

# Create heatmap
fig = px.density_heatmap(
    pivot_melted,
    x='YEAR',
    y='STATE_NAME',
    z='Average_Salary',
    color_continuous_scale='Viridis',
    title='Heatmap of Average AI Salaries by State and Year',
    labels={'Average_Salary': 'Avg Salary'}
)

fig.update_layout(height=600, width=1000)
fig.show()
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[123], line 4
      1 import plotly.express as px
      3 # Create the pivot table (same as in your code)
----> 4 pivot = df[df['IS_AI_CAREER'] == 1].pivot_table(
      5     index='STATE_NAME',
      6     columns='YEAR',
      7     values='SALARY',
      8     aggfunc='mean'
      9 ).reset_index()
     11 # Melt to long format for Plotly
     12 pivot_melted = pivot.melt(id_vars='STATE_NAME', var_name='YEAR', value_name='Average_Salary')

File c:\Python312\Lib\site-packages\pandas\core\frame.py:9509, in DataFrame.pivot_table(self, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)
   9492 @Substitution("")
   9493 @Appender(_shared_docs["pivot_table"])
   9494 def pivot_table(
   (...)
   9505     sort: bool = True,
   9506 ) -> DataFrame:
   9507     from pandas.core.reshape.pivot import pivot_table
-> 9509     return pivot_table(
   9510         self,
   9511         values=values,
   9512         index=index,
   9513         columns=columns,
   9514         aggfunc=aggfunc,
   9515         fill_value=fill_value,
   9516         margins=margins,
   9517         dropna=dropna,
   9518         margins_name=margins_name,
   9519         observed=observed,
   9520         sort=sort,
   9521     )

File c:\Python312\Lib\site-packages\pandas\core\reshape\pivot.py:102, in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)
     99     table = concat(pieces, keys=keys, axis=1)
    100     return table.__finalize__(data, method="pivot_table")
--> 102 table = __internal_pivot_table(
    103     data,
    104     values,
    105     index,
    106     columns,
    107     aggfunc,
    108     fill_value,
    109     margins,
    110     dropna,
    111     margins_name,
    112     observed,
    113     sort,
    114 )
    115 return table.__finalize__(data, method="pivot_table")

File c:\Python312\Lib\site-packages\pandas\core\reshape\pivot.py:172, in __internal_pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed, sort)
    169     values = list(values)
    171 observed_bool = False if observed is lib.no_default else observed
--> 172 grouped = data.groupby(keys, observed=observed_bool, sort=sort, dropna=dropna)
    173 if observed is lib.no_default and any(
    174     ping._passed_categorical for ping in grouped._grouper.groupings
    175 ):
    176     warnings.warn(
    177         "The default value of observed=False is deprecated and will change "
    178         "to observed=True in a future version of pandas. Specify "
   (...)
    181         stacklevel=find_stack_level(),
    182     )

File c:\Python312\Lib\site-packages\pandas\core\frame.py:9183, in DataFrame.groupby(self, by, axis, level, as_index, sort, group_keys, observed, dropna)
   9180 if level is None and by is None:
   9181     raise TypeError("You have to supply one of 'by' and 'level'")
-> 9183 return DataFrameGroupBy(
   9184     obj=self,
   9185     keys=by,
   9186     axis=axis,
   9187     level=level,
   9188     as_index=as_index,
   9189     sort=sort,
   9190     group_keys=group_keys,
   9191     observed=observed,
   9192     dropna=dropna,
   9193 )

File c:\Python312\Lib\site-packages\pandas\core\groupby\groupby.py:1329, in GroupBy.__init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, observed, dropna)
   1326 self.dropna = dropna
   1328 if grouper is None:
-> 1329     grouper, exclusions, obj = get_grouper(
   1330         obj,
   1331         keys,
   1332         axis=axis,
   1333         level=level,
   1334         sort=sort,
   1335         observed=False if observed is lib.no_default else observed,
   1336         dropna=self.dropna,
   1337     )
   1339 if observed is lib.no_default:
   1340     if any(ping._passed_categorical for ping in grouper.groupings):

File c:\Python312\Lib\site-packages\pandas\core\groupby\grouper.py:1043, in get_grouper(obj, key, axis, level, sort, observed, validate, dropna)
   1041         in_axis, level, gpr = False, gpr, None
   1042     else:
-> 1043         raise KeyError(gpr)
   1044 elif isinstance(gpr, Grouper) and gpr.key is not None:
   1045     # Add key to exclusions
   1046     exclusions.add(gpr.key)

KeyError: 'YEAR'

11. Boxplot: Salary by Remote Type (AI vs. Non-AI)

import plotly.express as px

fig = px.box(
    df,
    x='REMOTE_TYPE_NAME',
    y='SALARY',
    color='IS_AI_CAREER',
    title='Salary Distribution by Remote Type (AI vs. Non-AI Careers)',
    labels={
        'REMOTE_TYPE_NAME': 'Remote Type',
        'SALARY': 'Salary',
        'IS_AI_CAREER': 'AI Career (1 = AI, 0 = Non-AI)'
    },
    height=600,
    width=1000
)

fig.update_layout(
    boxmode='group',  # Grouped by hue (IS_AI_CAREER)
    xaxis_tickangle=0
)

fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json

12. Average Salary by State for AI Jobs

import plotly.express as px

# Prepare data
avg_salary_state = (
    df[df['IS_AI_CAREER'] == 1]
    .groupby('STATE_NAME')['SALARY']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

# Create interactive bar chart
fig = px.bar(
    avg_salary_state,
    x='STATE_NAME',
    y='SALARY',
    title='Average AI Career Salary by State',
    labels={'STATE_NAME': 'State', 'SALARY': 'Average Salary'},
    height=600,
    width=1000
)

fig.update_layout(xaxis_tickangle=45)
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json

13. Count of AI vs. Non-AI Jobs by State (Top 15 States)

import plotly.express as px
import pandas as pd

# Group and prepare data
ai_counts = df.groupby(['STATE_NAME', 'IS_AI_CAREER']).size().unstack(fill_value=0)

# Get top 15 states by total job count
top_states = ai_counts.sum(axis=1).sort_values(ascending=False).head(15).index

# Filter and reset for Plotly format
plot_data = ai_counts.loc[top_states].reset_index().melt(id_vars='STATE_NAME', var_name='IS_AI_CAREER', value_name='Job Count')

# Plotly stacked bar chart
fig = px.bar(
    plot_data,
    x='STATE_NAME',
    y='Job Count',
    color='IS_AI_CAREER',
    title='Job Count: AI vs. Non-AI by State (Top 15 States)',
    labels={'STATE_NAME': 'State', 'IS_AI_CAREER': 'Career Type'},
    barmode='stack',
    height=600,
    width=1000
)

fig.update_layout(xaxis_tickangle=45)
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json

14. Top 10 Highest-Paying AI Job Titles

import plotly.express as px

# Prepare data
top_ai_titles = (
    df[df['IS_AI_CAREER'] == 1]
    .groupby('TITLE_CLEAN')['SALARY']
    .mean()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)

# Create interactive bar chart
fig = px.bar(
    top_ai_titles,
    x='TITLE_CLEAN',
    y='SALARY',
    title='Top 10 Highest-Paying AI Job Titles',
    labels={'TITLE_CLEAN': 'Job Title', 'SALARY': 'Average Salary'},
    height=800
)

fig.update_layout(xaxis_tickangle=45)
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json

15. Salary Distribution by Industry (NAICS2_NAME)

import plotly.express as px

fig = px.box(
    df,
    x='NAICS2_NAME',
    y='SALARY',
    title='Salary Distribution by Industry',
    labels={'NAICS2_NAME': 'Industry', 'SALARY': 'Salary'},
    height=1500,
    width=1200
)

fig.update_layout(
    xaxis_tickangle=90,
    boxmode='group'
)

fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json

16. Industry-wise AI vs. Non-AI Job Count

import plotly.express as px
import pandas as pd

# Group and prepare data
industry_counts = df.groupby(['NAICS2_NAME', 'IS_AI_CAREER']).size().unstack(fill_value=0)

# Get top 10 industries by total job count
top_industries = industry_counts.sum(axis=1).sort_values(ascending=False).head(10).index

# Reshape for Plotly
plot_data = industry_counts.loc[top_industries].reset_index().melt(
    id_vars='NAICS2_NAME',
    var_name='IS_AI_CAREER',
    value_name='Job Count'
)

# Create stacked bar chart
fig = px.bar(
    plot_data,
    x='NAICS2_NAME',
    y='Job Count',
    color='IS_AI_CAREER',
    barmode='stack',
    title='AI vs. Non-AI Job Count by Industry (Top 10)',
    labels={'NAICS2_NAME': 'Industry', 'IS_AI_CAREER': 'Career Type'},
    height=600,
    width=1000
)

fig.update_layout(xaxis_tickangle=45)
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json

17. Average Salary by Original Pay Period (AI vs. Non-AI)

import plotly.express as px

import plotly.io as pio

# Tell Plotly to use Colab's renderer
pio.renderers.default = 'colab'

# Group and calculate average salary
avg_salary = (
    df.groupby(['ORIGINAL_PAY_PERIOD', 'IS_AI_CAREER'])['SALARY']
    .mean()
    .reset_index()
)

# Create interactive grouped bar chart
fig = px.bar(
    avg_salary,
    x='ORIGINAL_PAY_PERIOD',
    y='SALARY',
    color='IS_AI_CAREER',
    barmode='group',
    title='Average Salary by Pay Period (AI vs. Non-AI Careers)',
    labels={
        'ORIGINAL_PAY_PERIOD': 'Pay Period',
        'SALARY': 'Average Salary',
        'IS_AI_CAREER': 'AI Career (1 = AI, 0 = Non-AI)'
    },
    height=500,
    color_discrete_map={
        1: '#007acc',  # AI Career - Professional blue
        0: '#999999'   # Non-AI Career - Neutral gray
    }
)

fig.update_layout(
    xaxis_tickangle=0,
    template='plotly_white',
    legend_title='Career Type'
)

fig.show()

visualize the salary trend for AI vs. Non-AI jobs specifically in 2024,

df['POSTED'] = pd.to_datetime(df['POSTED'], errors='coerce')  # Ensure correct datetime format
df['YEAR'] = df['POSTED'].dt.year
df['MONTH'] = df['POSTED'].dt.month
df_2024 = df[df['YEAR'] == 2024]
trend_2024 = df_2024.groupby(['MONTH', 'IS_AI_CAREER'])['SALARY'].mean().reset_index()
import plotly.express as px

# Create the line plot
fig = px.line(
    trend_2024,
    x='MONTH',
    y='SALARY',
    color='IS_AI_CAREER',
    markers=True,
    title='Monthly Salary Trend in 2024: AI vs. Non-AI Careers',
    labels={
        'MONTH': 'Month',
        'SALARY': 'Average Salary',
        'IS_AI_CAREER': 'AI Career (1 = AI, 0 = Non-AI)'
    }
)

# Customize month ticks
fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=list(range(1, 13)),
        ticktext=['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
    ),
    yaxis_title='Average Salary',
    height=500,
    width=900
)

fig.show()

How do salaries differ across AI vs. non-AI careers

import plotly.graph_objects as go

# Create violin plots for each group
fig = go.Figure()

fig.add_trace(go.Violin(
    x=df['IS_AI_CAREER'][df['IS_AI_CAREER'] == 0].map({0: 'Non-AI'}),
    y=df['SALARY'][df['IS_AI_CAREER'] == 0],
    name='Non-AI',
    box_visible=True,
    meanline_visible=True
))

fig.add_trace(go.Violin(
    x=df['IS_AI_CAREER'][df['IS_AI_CAREER'] == 1].map({1: 'AI'}),
    y=df['SALARY'][df['IS_AI_CAREER'] == 1],
    name='AI',
    box_visible=True,
    meanline_visible=True
))

# Update layout
fig.update_layout(
    title='Salary Distribution: AI vs. Non-AI Careers',
    xaxis_title='Career Type',
    yaxis_title='Salary',
    height=500,
    width=700
)

fig.show()
import plotly.express as px

fig = px.histogram(
    df,
    x='SALARY',
    color='IS_AI_CAREER',
    nbins=40,
    barmode='overlay',
    opacity=0.5,
    title='Salary Distribution: AI vs. Non-AI Careers',
    labels={
        'SALARY': 'Salary',
        'IS_AI_CAREER': 'Career Type (0 = Non-AI, 1 = AI)'
    },
    height=500
)

fig.update_layout(
    legend_title_text='Career Type',
    xaxis_title='Salary',
    yaxis_title='Number of Job Postings'
)

fig.show()
import plotly.express as px
import pandas as pd

# Prepare salary mean and median data
salary_means = df.groupby('IS_AI_CAREER')['SALARY'].mean().reset_index()
salary_means['Metric'] = 'Mean'

salary_medians = df.groupby('IS_AI_CAREER')['SALARY'].median().reset_index()
salary_medians['Metric'] = 'Median'

# Combine for optional side-by-side
salary_combined = pd.concat([salary_means, salary_medians])
salary_combined['Career Type'] = salary_combined['IS_AI_CAREER'].map({0: 'Non-AI', 1: 'AI'})

# Plot Mean Salary
fig_mean = px.bar(
    salary_means,
    x='IS_AI_CAREER',
    y='SALARY',
    color='IS_AI_CAREER',
    title='Average Salary: AI vs. Non-AI Careers',
    labels={'IS_AI_CAREER': 'Career Type', 'SALARY': 'Average Salary'},
    color_discrete_sequence=['skyblue', 'orange'],
    height=400,
    width=500
)
fig_mean.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=[0, 1],
        ticktext=['Non-AI', 'AI']
    ),
    showlegend=False
)
fig_mean.show()

# Plot Median Salary
fig_median = px.bar(
    salary_medians,
    x='IS_AI_CAREER',
    y='SALARY',
    color='IS_AI_CAREER',
    title='Median Salary: AI vs. Non-AI Careers',
    labels={'IS_AI_CAREER': 'Career Type', 'SALARY': 'Median Salary'},
    color_discrete_sequence=['lightgreen', 'salmon'],
    height=400,
    width=500
)
fig_median.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=[0, 1],
        ticktext=['Non-AI', 'AI']
    ),
    showlegend=False
)
fig_median.show()

Are remote jobs better paying than in-office roles?

1. Boxplot: Salary by Remote Type

import plotly.express as px

# Create interactive boxplot
fig = px.box(
    df,
    x='REMOTE_TYPE_NAME',
    y='SALARY',
    title='Salary Comparison: Remote vs. In-Office vs. Hybrid Jobs',
    labels={'REMOTE_TYPE_NAME': 'Job Type', 'SALARY': 'Salary'},
    points=False,  # Equivalent to showfliers=False
    height=500,
    width=800
)

fig.show()

2. Violin Plot: Salary by Remote Type

import plotly.graph_objects as go

# Get unique remote job types
remote_types = df['REMOTE_TYPE_NAME'].dropna().unique()

# Create violin traces for each remote type
fig = go.Figure()

for job_type in remote_types:
    fig.add_trace(go.Violin(
        x=[job_type] * len(df[df['REMOTE_TYPE_NAME'] == job_type]),
        y=df[df['REMOTE_TYPE_NAME'] == job_type]['SALARY'],
        name=job_type,
        box_visible=True,
        meanline_visible=True,
        points=False  # Set to 'all' if you want to show individual data points
    ))

# Update layout
fig.update_layout(
    title='Salary Distribution: Remote vs. In-Office vs. Hybrid',
    xaxis_title='Job Type',
    yaxis_title='Salary',
    height=600,
    width=800
)

fig.show()

3. Bar Chart: Mean or Median Salary by Remote Type

import plotly.express as px

# Compute mean salary by job type
mean_salary = df.groupby('REMOTE_TYPE_NAME')['SALARY'].mean().sort_values(ascending=False).reset_index()

# Create bar chart with annotations
fig = px.bar(
    mean_salary,
    x='REMOTE_TYPE_NAME',
    y='SALARY',
    text='SALARY',  # Add value labels
    title='Average Salary by Job Type (Remote vs. In-Office vs. Hybrid)',
    labels={'REMOTE_TYPE_NAME': 'Job Type', 'SALARY': 'Average Salary'},
    color_discrete_sequence=['skyblue'],
    height=500,
    width=700
)

# Format salary annotations
fig.update_traces(texttemplate='$%{text:,.0f}', textposition='outside')

# Adjust layout
fig.update_layout(
    xaxis_tickangle=0,
    uniformtext_minsize=10,
    uniformtext_mode='hide'  # hide labels if they overlap
)

fig.show()

What industries saw the biggest wage growth in 2024?

1. Calculate Wage Growth Percent by Industry

df['POSTED'] = pd.to_datetime(df['POSTED'], errors='coerce')
df['YEAR'] = df['POSTED'].dt.year

# Group by industry and year, then get mean salary
industry_year = df[df['YEAR'].isin([2023, 2024])].groupby(['NAICS2_NAME', 'YEAR'])['SALARY'].mean().unstack()

# Calculate growth percentage
industry_year['WAGE_GROWTH_%'] = 100 * (industry_year[2024] - industry_year[2024]) / industry_year[2024]
# Remove industries without both years' data
industry_year = industry_year.dropna(subset=[2024, 2024])
df['POSTED'] = pd.to_datetime(df['POSTED'], errors='coerce')
df['YEAR'] = df['POSTED'].dt.year

# Group by industry and year, then get mean salary
industry_year = (
    df[df['YEAR'].isin([2023, 2024])]
    .groupby(['NAICS2_NAME', 'YEAR'])['SALARY']
    .mean()
    .unstack()
)

# Only keep industries with data for both years
industry_year = industry_year.dropna(subset=[2024, 2024])

# Calculate growth percentage safely
industry_year['WAGE_GROWTH_%'] = 100 * (industry_year[2024] - industry_year[2024]) / industry_year[2024]
print(industry_year.columns)
Index([2024, 'WAGE_GROWTH_%'], dtype='object', name='YEAR')
import plotly.express as px

# Compute top 10 industries by average salary for 2024
industry_salary_2024 = (
    df[df['YEAR'] == 2024]
    .groupby('NAICS2_NAME')['SALARY']
    .mean()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)

# Create interactive bar chart with annotations
fig = px.bar(
    industry_salary_2024,
    x='NAICS2_NAME',
    y='SALARY',
    text='SALARY',  # This adds labels on top of bars
    title='Top 10 Highest Paying Industries (2024)',
    labels={'NAICS2_NAME': 'Industry', 'SALARY': 'Average Salary'},
    color_discrete_sequence=['pink'],
    height=650,
    width=900
)

# Format text labels (e.g., $120,000)
fig.update_traces(texttemplate='$%{text:,.0f}', textposition='outside')

fig.update_layout(
    xaxis_tickangle=45,
    uniformtext_minsize=10,
    uniformtext_mode='hide'  # hides overlapping labels
)

fig.show()
import plotly.express as px

industry_salary_2024 = (
    df[df['YEAR'] == 2024]
    .groupby('NAICS2_NAME')['SALARY']
    .mean()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)

fig = px.bar(
    industry_salary_2024,
    x='NAICS2_NAME',
    y='SALARY',
    title='Top 10 Highest Paying Industries (2024)',
    labels={'NAICS2_NAME': 'Industry', 'SALARY': 'Average Salary'},
    color='SALARY',
    color_continuous_scale='Greens'
)

fig.update_layout(
    xaxis_tickangle=45,
    template='plotly_white'
)

fig.show()

Plotly Pie Chart (Interactive)

import plotly.express as px

# Prepare data
ai_counts = df['IS_AI_CAREER'].value_counts().reset_index()
ai_counts.columns = ['IS_AI_CAREER', 'Count']
ai_counts['Category'] = ai_counts['IS_AI_CAREER'].map({0: 'Non-AI', 1: 'AI'})

# Plot
fig = px.pie(
    ai_counts,
    values='Count',
    names='Category',
    title='AI vs. Non-AI Job Distribution',
    color_discrete_sequence=['skyblue', 'orange']
)

fig.update_traces(textinfo='percent+label')
fig.show()
import plotly.express as px

# Optional: remove rows with missing salary values
scatter_df = df.dropna(subset=['SALARY_FROM', 'SALARY_TO', 'IS_AI_CAREER'])

# Optional: limit outliers for clearer visualization
scatter_df = scatter_df[(scatter_df['SALARY_FROM'] < 200000) & (scatter_df['SALARY_TO'] < 250000)]

# Plot
fig = px.scatter(
    scatter_df,
    x='SALARY_FROM',
    y='SALARY_TO',
    color='IS_AI_CAREER',
    labels={'IS_AI_CAREER': 'AI Job'},
    title='Scatter Plot: Salary From vs. Salary To (AI vs. Non-AI)',
    hover_data=['TITLE_CLEAN', 'STATE_NAME'],
    color_discrete_map={0: 'skyblue', 1: 'orange'}
)

fig.update_layout(template='plotly_white')
fig.show()
fig = px.scatter(
    df[df['YEAR'] == 2024],
    x='POSTED',
    y='SALARY',
    color='IS_AI_CAREER',
    title='AI vs. Non-AI Salaries Over Time (2024)',
    labels={'IS_AI_CAREER': 'AI Job'},
    hover_data=['TITLE_CLEAN', 'STATE_NAME']
)
fig.update_layout(template='plotly_white')
fig.show()
# Regenerate the DataFrame correctly
df['POSTED'] = pd.to_datetime(df['POSTED'], errors='coerce')

daily_avg_salary = (
    df[df['POSTED'].dt.year == 2024]
    .groupby([df['POSTED'].dt.date, 'IS_AI_CAREER'])['SALARY']
    .mean()
    .reset_index()
)

daily_avg_salary.columns = ['Date', 'AI Job', 'Avg Salary']
import plotly.express as px

fig = px.line(
    daily_avg_salary,
    x='Date',
    y='Avg Salary',
    color='AI Job',
    color_discrete_map={0: 'blue', 1: 'orange'},
    labels={'AI Job': 'Career Type'},
    title='📈 Daily Average Salary Trend (AI vs Non-AI Jobs) - 2024'
)

fig.update_layout(
    xaxis_title='Date Posted',
    yaxis_title='Average Salary',
    legend_title='Job Type',
    template='plotly_white'
)
fig.show()

1. Multiple Linear Regression

%pip install scikit-learn
Note: you may need to restart the kernel to use updated packages.

[notice] A new release of pip is available: 24.0 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip
Requirement already satisfied: scikit-learn in c:\python312\lib\site-packages (1.6.1)
Requirement already satisfied: numpy>=1.19.5 in c:\python312\lib\site-packages (from scikit-learn) (2.1.3)
Requirement already satisfied: scipy>=1.6.0 in c:\python312\lib\site-packages (from scikit-learn) (1.15.2)
Requirement already satisfied: joblib>=1.2.0 in c:\python312\lib\site-packages (from scikit-learn) (1.4.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in c:\python312\lib\site-packages (from scikit-learn) (3.6.0)
# 1. Import required libraries
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

# 2. Load your dataset (replace this with your actual data)
# Example: df = pd.read_csv("lightcast_job_postings.csv")

# 3. Select features and target
features = ['STATE_NAME', 'TITLE_CLEAN', 'SKILLS_NAME']
target = 'SALARY'

# 4. Drop missing values in required columns
df_reg = df.dropna(subset=features + [target])

# 5. Define X and y
X = df_reg[features]
y = df_reg[target]

# 6. Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 7. Preprocessor: one-hot encode categorical features
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), features)
])

# 8. Build pipeline with preprocessing + Linear Regression
reg_model = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', LinearRegression())
])

# 9. Fit the model
reg_model.fit(X_train, y_train)

# 10. Evaluate
train_score = reg_model.score(X_train, y_train)
test_score = reg_model.score(X_test, y_test)

print(f" Train R²: {train_score:.4f}")
print(f" Test R²: {test_score:.4f}")
 Train R²: 0.9808
 Test R²: 0.5143
from sklearn.metrics import mean_squared_error, r2_score

# Make predictions
y_pred = reg_model.predict(X_test)

# R² Score
r2 = r2_score(y_test, y_pred)

# MSE
mse = mean_squared_error(y_test, y_pred)

# RMSE
rmse = np.sqrt(mse)

# Print evaluation metrics
print("✅ Train R²:", reg_model.score(X_train, y_train))
print("✅ Test R²:", r2)
print(f"📉 MSE: {mse:,.2f}")
print(f"📉 RMSE: {rmse:,.2f}")
✅ Train R²: 0.9808207991321921
✅ Test R²: 0.5143198085664493
📉 MSE: 417,140,293.37
📉 RMSE: 20,424.01
# Keep top 50 titles
top_titles = df_reg['TITLE_CLEAN'].value_counts().nlargest(50).index
df_reg['TITLE_CLEAN'] = df_reg['TITLE_CLEAN'].where(df_reg['TITLE_CLEAN'].isin(top_titles), 'Other')

# Keep top 50 skills
top_skills = df_reg['SKILLS_NAME'].value_counts().nlargest(50).index
df_reg['SKILLS_NAME'] = df_reg['SKILLS_NAME'].where(df_reg['SKILLS_NAME'].isin(top_skills), 'Other')
import plotly.express as px
import pandas as pd

# Get predictions
y_pred = reg_model.predict(X_test)

# Create a DataFrame to plot
results_df = pd.DataFrame({
    'Actual Salary': y_test,
    'Predicted Salary': y_pred
})

# Plot using Plotly
fig = px.scatter(
    results_df,
    x='Actual Salary',
    y='Predicted Salary',
    title='Predicted vs Actual Salary (Plotly)',
    labels={'x': 'Actual Salary', 'y': 'Predicted Salary'},
    opacity=0.6
)

# Add reference line (y = x)
fig.add_shape(
    type='line',
    x0=results_df['Actual Salary'].min(),
    y0=results_df['Actual Salary'].min(),
    x1=results_df['Actual Salary'].max(),
    y1=results_df['Actual Salary'].max(),
    line=dict(color='red', dash='dash'),
)

fig.update_layout(template='plotly_white')
fig.show()

Ridge Regression

from sklearn.linear_model import Ridge

reg_model = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', Ridge(alpha=1.0))
])
reg_model.fit(X_train, y_train)
print("Train R²:", reg_model.score(X_train, y_train))
print("Test R²:", reg_model.score(X_test, y_test))
Train R²: 0.9077647413163654
Test R²: 0.5418680264560262
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
from sklearn.pipeline import Pipeline

# Create and train Ridge Regression pipeline
reg_model = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', Ridge(alpha=1.0))
])

reg_model.fit(X_train, y_train)

# Predict on test set
y_pred = reg_model.predict(X_test)

# Evaluate metrics
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Output results
print("✅ Train R²:", reg_model.score(X_train, y_train))
print("✅ Test R²:", r2)
print(f"📉 MSE: {mse:,.2f}")
print(f"📉 RMSE: {rmse:,.2f}")
✅ Train R²: 0.907764733484408
✅ Test R²: 0.5418666997879171
📉 MSE: 393,480,859.68
📉 RMSE: 19,836.35
import plotly.express as px
import pandas as pd

# Predict using your trained model
y_pred = reg_model.predict(X_test)

# Create DataFrame for plotting
results_df = pd.DataFrame({
    'Actual Salary': y_test,
    'Predicted Salary': y_pred
})

# Create scatter plot
fig = px.scatter(
    results_df,
    x='Actual Salary',
    y='Predicted Salary',
    title='Ridge Regression: Predicted vs Actual Salary',
    opacity=0.6,
    template='plotly_white'
)

# Add reference diagonal (perfect predictions)
fig.add_shape(
    type='line',
    x0=results_df['Actual Salary'].min(),
    y0=results_df['Actual Salary'].min(),
    x1=results_df['Actual Salary'].max(),
    y1=results_df['Actual Salary'].max(),
    line=dict(color='red', dash='dash'),
)

fig.update_layout(
    xaxis_title='Actual Salary',
    yaxis_title='Predicted Salary'
)

fig.show()

Random Forest

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# Select features and target
features = ['STATE_NAME', 'TITLE_CLEAN', 'SKILLS_NAME']
target = 'SALARY'

# Drop missing values
df_rf = df.dropna(subset=features + [target]).copy()

# Optionally reduce cardinality
top_titles = df_rf['TITLE_CLEAN'].value_counts().nlargest(50).index
df_rf['TITLE_CLEAN'] = df_rf['TITLE_CLEAN'].where(df_rf['TITLE_CLEAN'].isin(top_titles), 'Other')
top_skills = df_rf['SKILLS_NAME'].value_counts().nlargest(50).index
df_rf['SKILLS_NAME'] = df_rf['SKILLS_NAME'].where(df_rf['SKILLS_NAME'].isin(top_skills), 'Other')

# Define features and target
X = df_rf[features]
y = df_rf[target]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), features)
])

# Define Random Forest model pipeline
rf_model = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
rf_model.fit(X_train, y_train)
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['STATE_NAME', 'TITLE_CLEAN',
                                                   'SKILLS_NAME'])])),
                ('regressor', RandomForestRegressor(random_state=42))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
from sklearn.metrics import r2_score, mean_squared_error

# Predict
y_pred = rf_model.predict(X_test)

# Metrics
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R² Score: {r2:.4f}")
print(f"MSE: {mse:,.2f}")
print(f"RMSE: {rmse:,.2f}")
R² Score: 0.1253
MSE: 751,237,715.95
RMSE: 27,408.72
import plotly.express as px
import pandas as pd

y_pred = rf_model.predict(X_test)
results_df = pd.DataFrame({'Actual Salary': y_test, 'Predicted Salary': y_pred})

fig = px.scatter(
    results_df,
    x='Actual Salary',
    y='Predicted Salary',
    title='Random Forest: Predicted vs Actual Salary',
    opacity=0.6
)
fig.add_shape(
    type='line',
    x0=results_df['Actual Salary'].min(),
    y0=results_df['Actual Salary'].min(),
    x1=results_df['Actual Salary'].max(),
    y1=results_df['Actual Salary'].max(),
    line=dict(color='red', dash='dash')
)
fig.update_layout(template='plotly_white')
fig.show()

K-Means Clustering

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Prepare features for clustering (use code numbers, not names)
features = ['SOC_2021_2', 'NAICS2']  # Change as needed
df_cluster = df.dropna(subset=features).copy()

# Convert categorical codes to numbers (if needed)
for col in features:
    df_cluster[col] = LabelEncoder().fit_transform(df_cluster[col].astype(str))

X = df_cluster[features]
X_scaled = StandardScaler().fit_transform(X)

# Find the optimal number of clusters (Elbow Method)
inertia = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

plt.plot(range(2, 11), inertia, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method For Optimal k')
plt.show()

# Let's use 3 clusters (or your elbow result)
kmeans = KMeans(n_clusters=3, random_state=42)
df_cluster['Cluster'] = kmeans.fit_predict(X_scaled)

print(df_cluster.groupby('Cluster').size())

Cluster
0    49246
1    12019
2    11211
dtype: int64
import plotly.express as px
import pandas as pd

# Prepare elbow data
elbow_df = pd.DataFrame({
    'k': list(range(2, 11)),
    'inertia': inertia
})

# Create line plot
fig = px.line(
    elbow_df,
    x='k',
    y='inertia',
    markers=True,
    title='Elbow Method for Optimal k',
    labels={'k': 'Number of Clusters', 'inertia': 'Inertia'}
)

fig.show()
df_with_titles = df.merge(df_cluster[['ID', 'Cluster']], on='ID')
from sklearn.feature_extraction.text import TfidfVectorizer

# For each cluster, analyze keywords in job titles
for cluster_num in sorted(df_with_titles['Cluster'].unique()):
    cluster_data = df_with_titles[df_with_titles['Cluster'] == cluster_num]

    vectorizer = TfidfVectorizer(stop_words='english', max_features=20)
    tfidf_matrix = vectorizer.fit_transform(cluster_data['TITLE_CLEAN'].astype(str))

    print(f"\n🔹 Top Terms in Cluster {cluster_num}:")
    print(vectorizer.get_feature_names_out())

🔹 Top Terms in Cluster 0:
['analyst' 'analytics' 'architect' 'associate' 'business' 'cloud'
 'consultant' 'data' 'enterprise' 'functional' 'intelligence' 'lead'
 'management' 'manager' 'oracle' 'remote' 'sap' 'senior' 'specialist' 'sr']

🔹 Top Terms in Cluster 1:
['analyst' 'analytics' 'architect' 'business' 'career' 'cloud'
 'consultant' 'data' 'enterprise' 'erp' 'functional' 'ii' 'intelligence'
 'lead' 'oracle' 'path' 'remote' 'sap' 'senior' 'sr']

🔹 Top Terms in Cluster 2:
['analyst' 'analytics' 'architect' 'business' 'consultant' 'data'
 'engineer' 'enterprise' 'erp' 'functional' 'ii' 'intelligence' 'lead'
 'management' 'remote' 'sap' 'senior' 'solution' 'sr' 'systems']
pip install wordcloud
Requirement already satisfied: wordcloud in /usr/local/lib/python3.11/dist-packages (1.9.4)
Requirement already satisfied: numpy>=1.6.1 in /usr/local/lib/python3.11/dist-packages (from wordcloud) (2.0.2)
Requirement already satisfied: pillow in /usr/local/lib/python3.11/dist-packages (from wordcloud) (11.2.1)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.11/dist-packages (from wordcloud) (3.10.0)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib->wordcloud) (1.3.2)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-packages (from matplotlib->wordcloud) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib->wordcloud) (4.57.0)
Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib->wordcloud) (1.4.8)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib->wordcloud) (24.2)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib->wordcloud) (3.2.3)
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.11/dist-packages (from matplotlib->wordcloud) (2.9.0.post0)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.17.0)
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Combine titles for each cluster into one string
for cluster_num in sorted(df_with_titles['Cluster'].unique()):
    text = ' '.join(df_with_titles[df_with_titles['Cluster'] == cluster_num]['TITLE_CLEAN'].dropna().astype(str))

    # Create word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis').generate(text)

    # Plot
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'🧠 Word Cloud for Cluster {cluster_num}', fontsize=16)
    plt.show()
/usr/local/lib/python3.11/dist-packages/IPython/core/pylabtools.py:151: UserWarning:

Glyph 129504 (\N{BRAIN}) missing from font(s) DejaVu Sans.

/usr/local/lib/python3.11/dist-packages/IPython/core/pylabtools.py:151: UserWarning:

Glyph 129504 (\N{BRAIN}) missing from font(s) DejaVu Sans.

/usr/local/lib/python3.11/dist-packages/IPython/core/pylabtools.py:151: UserWarning:

Glyph 129504 (\N{BRAIN}) missing from font(s) DejaVu Sans.